# NOTE(review): rm(list = ls()) only clears the global environment (not loaded
# packages, options, or par settings) and is discouraged in scripts; prefer
# restarting R for a truly clean session.
rm(list = ls())

library('rstan')      # stan_model(), sampling(), extract()
library('bayesplot')  # mcmc_areas(), ppc_*(), ppd_*() diagnostics
library('ggplot2')    # labs(), geom_abline() to annotate bayesplot output
library('BART')       # wbart(): weighted Bayesian Additive Regression Trees

# Large base font for all bayesplot figures in this document
bayesplot_theme_set(theme_default(base_size = 24, base_family = "sans"))

# Prepare data ----

# edited to add "escs" and "wealth" to the data
data_full <- readRDS("./data/cleaned/cleaned_data.Rds")
# only consider AUS - for better balance between treatment/control groups
data_AUS <- data_full[data_full$country == 'AUS', ]
table(data_AUS$public_private)
## 
## private  public 
##    5210    6596
head(data_AUS)
## # A tibble: 6 × 14
##   country school_id student_id gender computer internet  math  read science
##   <fct>   <fct>     <fct>      <fct>  <fct>    <fct>    <dbl> <dbl>   <dbl>
## 1 AUS     3600002   3600210    female yes      yes       541.  611.    530.
## 2 AUS     3600002   3602562    female yes      yes       431.  469.    454.
## 3 AUS     3600002   3602765    male   yes      yes       617.  642.    659.
## 4 AUS     3600002   3603797    female yes      yes       359.  449.    401.
## 5 AUS     3600002   3606990    female yes      yes       475.  440.    443.
## 6 AUS     3600002   3608618    female yes      yes       436.  498.    420.
## # ℹ 5 more variables: stu_wgt <dbl>, wealth <dbl>, escs <dbl>,
## #   public_private <fct>, family_class <chr>
data <- data_AUS[, c("gender", "computer", "internet", "math", "read", "science", "public_private", "family_class", "wealth", "escs", "stu_wgt")]

# one-hot encoding of binary variables
data$gender <- as.numeric(data$gender == 'female')
data$computer <- as.numeric(data$computer == "yes")
data$internet <- as.numeric(data$internet == "yes")
data$public_private <- as.numeric(data$public_private == "private")
data$family_class <- as.numeric((data$family_class == "middle class"))

# combine "computer" and "internet" into "tech_access"
# (1 only when the student has BOTH a computer AND internet access)
data$tech_access <- data$computer * data$internet

# log transformation of the scores - will transform back when making predictions
data$math_log <- log(data$math)
data$read_log <- log(data$read)
data$science_log <- log(data$science)

head(data)
## # A tibble: 6 × 15
##   gender computer internet  math  read science public_private family_class
##    <dbl>    <dbl>    <dbl> <dbl> <dbl>   <dbl>          <dbl>        <dbl>
## 1      1        1        1  541.  611.    530.              1            1
## 2      1        1        1  431.  469.    454.              1            1
## 3      0        1        1  617.  642.    659.              1            1
## 4      1        1        1  359.  449.    401.              1            1
## 5      1        1        1  475.  440.    443.              1            1
## 6      1        1        1  436.  498.    420.              1            1
## # ℹ 7 more variables: wealth <dbl>, escs <dbl>, stu_wgt <dbl>,
## #   tech_access <dbl>, math_log <dbl>, read_log <dbl>, science_log <dbl>
# Covariate matrix: columns 1, 8:10, 12 of `data` are
# gender, family_class, wealth, escs, tech_access (see head() output above).
X <- data[, c(1, 8:10, 12)]
# "Offset" is a constant column of 1s acting as the intercept in the Stan models
X[,"Offset"] <- 1
# Treatment indicator (1 = private school).
# NOTE(review): this shadows base::t for data, but t(M) calls below still find
# the transpose FUNCTION because R skips non-function bindings in call position.
t <- data$public_private

treat_ind <- data$public_private==1
control_ind <- data$public_private==0

X_treat <- X[treat_ind, ]
X_control <- X[control_ind, ]

head(X_treat)
## # A tibble: 6 × 6
##   gender family_class wealth   escs tech_access Offset
##    <dbl>        <dbl>  <dbl>  <dbl>       <dbl>  <dbl>
## 1      1            1  0.852  0.101           1      1
## 2      1            1  0.664 -0.793           1      1
## 3      0            1 -0.267  0.166           1      1
## 4      1            1  1.31   0.726           1      1
## 5      1            1  0.901 -0.170           1      1
## 6      1            1  1.10   1.21            1      1
head(X_control)
## # A tibble: 6 × 6
##   gender family_class wealth   escs tech_access Offset
##    <dbl>        <dbl>  <dbl>  <dbl>       <dbl>  <dbl>
## 1      1            0  0.168 -0.170           1      1
## 2      0            0 -0.887 -0.337           1      1
## 3      0            1  0.536 -1.13            1      1
## 4      1            0  0.541 -0.898           1      1
## 5      0            1  0.146  0.769           1      1
## 6      0            1 -1.25  -1.41            0      1
# Multiply each row of `input_matrix` elementwise by `vector`.
#
# Vectorized replacement for the original row-by-row loop: sweep() applies the
# multiplier across margin 2 (columns), so row i becomes input_matrix[i, ] * vector.
# Also fixes the zero-row edge case (the old `for (i in 1:row_num)` loop ran
# with i = 1, 0 when row_num was 0).
#
# @param input_matrix numeric matrix (n x k); a data frame is coerced.
# @param vector numeric vector of length k (one multiplier per column).
# @return plain numeric n x k matrix (no dimnames, matching the original).
element_prod_row <- function(input_matrix, vector){
  # unname() reproduces the original's attribute-free result matrix
  sweep(unname(as.matrix(input_matrix)), 2, vector, "*")
}

# Multiply each column of `input_matrix` elementwise by `vector`.
#
# Vectorized replacement for the original column-by-column loop: R recycles
# `vector` column-major, so `m * vector` multiplies element (i, j) by
# vector[i] — exactly what the loop computed. Also safe for zero columns
# (the old `for (j in 1:col_num)` loop was not).
#
# @param input_matrix numeric matrix (n x k); a data frame is coerced.
# @param vector numeric vector of length n (one multiplier per row).
# @return plain numeric n x k matrix (no dimnames, matching the original).
element_prod_col <- function(input_matrix, vector){
  # unname() reproduces the original's attribute-free result matrix
  unname(as.matrix(input_matrix)) * vector
}

# Initialize models ----

# outcome model 1 - bayesian linear regression
# Normal-likelihood regression: N(0, pr_sd) prior on each coefficient, a
# near-flat inverse-gamma prior on the error variance, and posterior-predictive
# draws (y_rep) generated for the full x_test design matrix.
# NOTE(review): `real y[N]` / `real y_rep[N_test]` is pre-Stan-2.26 array
# syntax; it works with this rstan toolchain but is deprecated in newer Stan.
outcome_normal_code = "
data {
int<lower=0> N;      // number of data items
int<lower=0> N_test;   // number of test data items
int<lower=0> K;       // number of predictors
real<lower=0> pr_sd; // std dev of the prior
matrix[N, K] x;       // predictor matrix
real y[N];          // output vector
matrix[N_test, K] x_test;   // predictor matrix
}

parameters {
  vector[K] beta;       // coefficients for predictors
  real<lower=0> sigma2;  // error scale
}

transformed parameters {
  vector[N] mu = x * beta;
  real<lower=0> sigma = sqrt(sigma2);
}

model {
  beta ~ normal(0, pr_sd);  // Note: beta is k-dim
  sigma2 ~ inv_gamma(0.001,0.001);  // Close to a flat prior
  y ~ normal(mu,sigma);  // likelihood
}

generated quantities {
  real y_rep[N_test];
  y_rep = normal_rng(x_test * beta, sigma);
} "

# Compile the Stan program. NOTE(review): the "simple C file ... cmath not
# found" log that follows is rstan's toolchain probe failing on this macOS
# setup; sampling succeeds later, so the model itself evidently compiled.
outcome_normal = stan_model(model_code=outcome_normal_code)
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
## #include <cmath>
##          ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1
# propensity score model
# Bayesian logistic regression of treatment (private school) on the covariates;
# y_rep draws support posterior-predictive checks on the treatment indicator.
ps_logis_code = "
data {
  int<lower=0> N;      // number of data items
  int<lower=0> K;      // number of predictors
  real<lower=0> pr_sd; // std dev of the prior
  matrix[N, K] x;      // predictor matrix
  int<lower=0, upper=1> y[N];      // Binary outcome variable
}

parameters {
  vector[K] beta;                  // Regression coefficients
}

transformed parameters {
  vector[N] mu = x * beta;
}

model {
  // Prior for beta
  beta ~ normal(0, pr_sd);           
  
  // Likelihood
  y ~ bernoulli_logit(mu);   // Logistic regression likelihood
}

generated quantities {
  int<lower=0, upper=1> y_rep[N];
  y_rep = bernoulli_logit_rng(mu);
} "

# Compile; same benign toolchain-probe failure in the log as above.
ps_logis = stan_model(model_code=ps_logis_code)
## Trying to compile a simple C file
## Running /Library/Frameworks/R.framework/Resources/bin/R CMD SHLIB foo.c
## clang -mmacosx-version-min=10.13 -I"/Library/Frameworks/R.framework/Resources/include" -DNDEBUG   -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/Rcpp/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/unsupported"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/BH/include" -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/src/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppParallel/include/"  -I"/Library/Frameworks/R.framework/Versions/4.2/Resources/library/rstan/include" -DEIGEN_NO_DEBUG  -DBOOST_DISABLE_ASSERTS  -DBOOST_PENDING_INTEGER_LOG2_HPP  -DSTAN_THREADS  -DUSE_STANC3 -DSTRICT_R_HEADERS  -DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION  -D_HAS_AUTO_PTR_ETC=0  -include '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp'  -D_REENTRANT -DRCPP_PARALLEL_USE_TBB=1   -I/usr/local/include   -fPIC  -Wall -g -O2  -c foo.c -o foo.o
## In file included from <built-in>:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/StanHeaders/include/stan/math/prim/fun/Eigen.hpp:22:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Dense:1:
## In file included from /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/Core:19:
## /Library/Frameworks/R.framework/Versions/4.2/Resources/library/RcppEigen/include/Eigen/src/Core/util/Macros.h:679:10: fatal error: 'cmath' file not found
## #include <cmath>
##          ^~~~~~~
## 1 error generated.
## make: *** [foo.o] Error 1

# Causal estimate ----

# Doubly-robust (AIPW-style) per-draw, per-unit treatment-effect estimate.
#
# @param y  observed outcome vector, length N.
# @param t  binary treatment vector, length N (1 = treated).
# @param y0_pred,y1_pred posterior-predictive outcome draws (draws x N) under
#   control and treatment respectively, for ALL N units.
# @param ps propensity-score draws, draws x N.
# @return draws x N matrix of doubly-robust effect estimates.
dr_estimate <- function(y, t, y0_pred, y1_pred, ps){
  # Prediction / propensity for the arm each unit was actually observed in;
  # sweep(..., 2, v, "*") multiplies column j by v[j] (same as the file's
  # element_prod_row helper, inlined here in vectorized form).
  y_pred = sweep(y0_pred, 2, 1-t, "*") + sweep(y1_pred, 2, t, "*")
  ps_pred = sweep(1-ps, 2, 1-t, "*") + sweep(ps, 2, t, "*")
  temp1 = y1_pred - y0_pred
  # BUG FIX: the original `y - y_pred` recycled the length-N vector y DOWN the
  # draws x N matrix column-major, pairing outcomes with the wrong units.
  # Subtract y[j] from column j instead (sweep gives y_pred - y; negate).
  temp2 = -sweep(y_pred, 2, y, "-")
  dr_estimate = temp1 + temp2/ps_pred
  
  dr_estimate
}

# Propensity score model ----

# Stan data list: the intercept enters through X's constant "Offset" column,
# so the model has no separate alpha. rstan coerces the tibble X to a matrix.
ps_data <- list(N = nrow(X), K = ncol(X), pr_sd = 3, x=X, y=t)
# Single chain, 1000 retained draws (iter - warmup); more chains would enable
# Rhat-based convergence checks.
ps_fit <- sampling(ps_logis, data=ps_data, iter=5000, warmup=4000, chains=1)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000757 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 7.57 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 40.353 seconds (Warm-up)
## Chain 1:                13.213 seconds (Sampling)
## Chain 1:                53.566 seconds (Total)
## Chain 1:
# Inverse-logit (plogis) of the linear-predictor draws: draws x N matrix of
# propensity scores.
ps <- 1/(1 + exp(-extract(ps_fit)$mu))
# check mixing 
class(ps_fit)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(ps_fit)
# NOTE(review): colnames(X) has 6 names (incl. Offset) but only 5 slots are
# replaced — hence the warning; the first 5 betas get covariate names and
# beta[6] (the Offset/intercept coefficient) keeps its default name.
colnames(post_smp)[1:5] <- colnames(X)
## Warning in colnames(post_smp)[1:5] <- colnames(X): number of items to replace
## is not a multiple of replacement length
#hist(post_smp$TV,50)
mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)

# Trace plots for individual coefficients (visual mixing check)
plot(post_smp$gender, type='l')

plot(post_smp$family_class, type='l')

plot(post_smp$tech_access, type='l')

plot(post_smp$wealth, type='l')

plot(post_smp$escs, type='l')

# predictive check on the propensity score model
y_rep = extract(ps_fit)$y_rep
# One posterior predictive check compares the number of 0's and 1's in the observed vs predictive datasets
ppc_bars(y=t,yrep=y_rep)+ggplot2::labs(y = "Number of 0's and 1's in observed and Predictive datasets")

# Another just plots the proportions of 1's
ppc_stat(y=t,yrep=y_rep)+ggplot2::labs(x = "Proportion of 1's in observed and Predictive datasets")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Outcome models ----

## Math ----

# Outcome for this section: log math score (indices reuse treat_ind/control_ind)
y <- data$math_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]

### Bayesian linear regression ----

# treatment group: fit model
# Train on treated rows only, but predict (x_test = X) for EVERY student, so
# counterfactual treated outcomes are available for the control group too.
math_outcome_treat <- list(N = nrow(X_treat),
                           N_test = nrow(X),
                           K = ncol(X_treat),
                           pr_sd = 3,
                           x = X_treat,
                           y = y_treat,
                           x_test = X)
math_outcome_treat_BLR <- sampling(outcome_normal, data=math_outcome_treat, iter=5000, warmup=4000, chains=1)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000384 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 3.84 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 13.904 seconds (Warm-up)
## Chain 1:                4.411 seconds (Sampling)
## Chain 1:                18.315 seconds (Total)
## Chain 1:
# treatment group: check mixing
class(math_outcome_treat_BLR)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(math_outcome_treat_BLR)
# Same 6-names-into-5-slots mismatch as the propensity model (warning below)
colnames(post_smp)[1:5] <- colnames(X)
## Warning in colnames(post_smp)[1:5] <- colnames(X): number of items to replace
## is not a multiple of replacement length
mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)

# Trace plots for individual coefficients
plot(post_smp$gender, type='l')

plot(post_smp$family_class, type='l')

plot(post_smp$tech_access, type='l')

# treatment group: predictive check
y1_pred <- extract(math_outcome_treat_BLR)$y_rep
 
ppd_intervals(y1_pred[ ,treat_ind], x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1)  +
   ggplot2::labs(y = "Predicted y's", x = "Observed y's")

# t(t(M)-v) subtracts the length-n vector v from each COLUMN of draws x n M
ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0)  +
   ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's")

# control group: fit model
# Train on control rows only; predict for all students (x_test = X).
math_outcome_control <- list(N = nrow(X_control),
                           N_test = nrow(X),
                           K = ncol(X_control),
                           pr_sd = 3,
                           x = X_control,
                           y = y_control,
                           x_test = X)
math_outcome_control_BLR <- sampling(outcome_normal, data=math_outcome_control, iter=5000, warmup=4000, chains=1)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.00032 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 3.2 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 14.199 seconds (Warm-up)
## Chain 1:                4.435 seconds (Sampling)
## Chain 1:                18.634 seconds (Total)
## Chain 1:
# control group: check mixing
class(math_outcome_control_BLR)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(math_outcome_control_BLR)
# Same 6-names-into-5-slots mismatch as above (warning below)
colnames(post_smp)[1:5] <- colnames(X)
## Warning in colnames(post_smp)[1:5] <- colnames(X): number of items to replace
## is not a multiple of replacement length
mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)

plot(post_smp$gender, type='l')

plot(post_smp$family_class, type='l')

plot(post_smp$tech_access, type='l')

# control group: predictive check
y0_pred <- extract(math_outcome_control_BLR)$y_rep
 
ppd_intervals(y0_pred[ ,control_ind], x=y[control_ind]) + geom_abline(intercept = 0, slope = 1)  +
   ggplot2::labs(y = "Predicted y's", x = "Observed y's")

ppd_intervals(t(t(y0_pred[ ,control_ind])-y[control_ind]),x=y[control_ind]) + geom_abline(intercept = 0, slope = 0)  +
   ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's")

### BART ----

# treatment group: fit model
# BART on the 5 covariates only (Offset dropped: trees need no intercept);
# x.test = X yields predictions for all students, counterfactuals included.
# NOTE(review): `w` is the FULL-length weight vector (11806) while y.train has
# only the 5210 treated rows — should this be data_AUS$stu_wgt[treat_ind]?
# Verify; as written, weights and training rows appear misaligned.
math_outcome_treat_BART <- wbart(x.train=as.matrix(X_treat[1:5]), 
                            y.train = y_treat, 
                            x.test=as.matrix(X[1:5]),
                            w=data_AUS$stu_wgt)
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.072444, -0.096642
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.032790,3.000000,0.005912
## *****sigma: 0.174221
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 57s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# treatment group: predictive check
# Posterior-predictive BART draws for all students; subset to treated rows
# and compare predictions (and errors) against observed log math scores.
y1_pred <- math_outcome_treat_BART$yhat.test

ppd_intervals(y1_pred[ ,treat_ind],x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Math: Predictive check for treatment group")

# FIX: this error plot is for the TREATMENT group; its title previously said
# "control group" (copy-paste slip).
ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title="Math: Predictive check for treatment group")

# control group: fit model
# NOTE(review): same possible weight misalignment as the treated fit — `w` is
# full-length (11806) vs 6596 control training rows; likely should be
# data_AUS$stu_wgt[control_ind]. Verify.
math_outcome_control_BART <- wbart(x.train=as.matrix(X_control[1:5]), 
                              y.train = y_control, 
                              x.test=as.matrix(X[1:5]), 
                              w=data_AUS$stu_wgt)
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: -0.034944, 0.102562
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.030787,3.000000,0.006994
## *****sigma: 0.189487
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 67s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# control group: predictive check
# BART posterior-predictive draws subset to control rows vs observed outcomes
y0_pred <- math_outcome_control_BART$yhat.test

ppd_intervals(y0_pred[ ,control_ind],x=y[control_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Math: Predictive check for control group")

ppd_intervals(t(t(y0_pred[ ,control_ind])-y[control_ind]), x=y[control_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title = "Math: Predictive check for control group")

## Read ----

# Outcome for this section: log reading score
y <- data$read_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]

### Bayesian linear regression ----

# treatment group: fit model (same structure as the math section)
read_outcome_treat <- list(N = nrow(X_treat),
                           N_test = nrow(X),
                           K = ncol(X_treat),
                           pr_sd = 3,
                           x = X_treat,
                           y = y_treat,
                           x_test = X)
read_outcome_treat_BLR <- sampling(outcome_normal, data=read_outcome_treat, iter=5000, warmup=4000, chains=1)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000643 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 6.43 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 13.218 seconds (Warm-up)
## Chain 1:                4.045 seconds (Sampling)
## Chain 1:                17.263 seconds (Total)
## Chain 1:
# treatment group: check mixing (same diagnostics as the math section)
class(read_outcome_treat_BLR)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(read_outcome_treat_BLR)
# Same 6-names-into-5-slots mismatch as above (warning below)
colnames(post_smp)[1:5] <- colnames(X)
## Warning in colnames(post_smp)[1:5] <- colnames(X): number of items to replace
## is not a multiple of replacement length
mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)

plot(post_smp$gender, type='l')

plot(post_smp$family_class, type='l')

plot(post_smp$tech_access, type='l')

# treatment group: predictive check
y1_pred <- extract(read_outcome_treat_BLR)$y_rep
 
ppd_intervals(y1_pred[ ,treat_ind], x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1)  +
   ggplot2::labs(y = "Predicted y's", x = "Observed y's")

ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0)  +
   ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's")

# control group: fit model (same structure as the math section)
read_outcome_control <- list(N = nrow(X_control),
                           N_test = nrow(X),
                           K = ncol(X_control),
                           pr_sd = 3,
                           x = X_control,
                           y = y_control,
                           x_test = X)
read_outcome_control_BLR <- sampling(outcome_normal, data=read_outcome_control, iter=5000, warmup=4000, chains=1)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.001117 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 11.17 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 12.458 seconds (Warm-up)
## Chain 1:                3.865 seconds (Sampling)
## Chain 1:                16.323 seconds (Total)
## Chain 1:
# control group: check mixing (same diagnostics as the math section)
class(read_outcome_control_BLR)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(read_outcome_control_BLR)
# Same 6-names-into-5-slots mismatch as above (warning below)
colnames(post_smp)[1:5] <- colnames(X)
## Warning in colnames(post_smp)[1:5] <- colnames(X): number of items to replace
## is not a multiple of replacement length
mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)

plot(post_smp$gender, type='l')

plot(post_smp$family_class, type='l')

plot(post_smp$tech_access, type='l')

# control group: predictive check
y0_pred <- extract(read_outcome_control_BLR)$y_rep
 
ppd_intervals(y0_pred[ ,control_ind], x=y[control_ind]) + geom_abline(intercept = 0, slope = 1)  +
   ggplot2::labs(y = "Predicted y's", x = "Observed y's")

ppd_intervals(t(t(y0_pred[ ,control_ind])-y[control_ind]),x=y[control_ind]) + geom_abline(intercept = 0, slope = 0)  +
   ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's")

### BART ----

# treatment group: fit model
# NOTE(review): same possible weight misalignment as the math BART fits —
# `w` is full-length (11806) vs 5210 treated training rows. Verify.
read_outcome_treat_BART <- wbart(x.train=as.matrix(X_treat[1:5]), 
                            y.train = y_treat, 
                            x.test=as.matrix(X[1:5]),
                            w=data_AUS$stu_wgt)
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.171592, -0.356653
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.028634,3.000000,0.007594
## *****sigma: 0.197444
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 38s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# treatment group: predictive check
# Posterior-predictive BART draws for all students; subset to treated rows
# and compare predictions (and errors) against observed log reading scores.
y1_pred <- read_outcome_treat_BART$yhat.test

ppd_intervals(y1_pred[ ,treat_ind],x=y[treat_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Read: Predictive check for treatment group")

# FIX: this error plot is for the TREATMENT group; its title previously said
# "control group" (copy-paste slip).
ppd_intervals(t(t(y1_pred[ ,treat_ind])-y[treat_ind]),x=y[treat_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title="Read: Predictive check for treatment group")

# control group: fit model
# NOTE(review): same possible weight misalignment — `w` is full-length (11806)
# vs 6596 control training rows. Verify.
read_outcome_control_BART <- wbart(x.train=as.matrix(X_control[1:5]), 
                              y.train = y_control, 
                              x.test=as.matrix(X[1:5]), 
                              w=data_AUS$stu_wgt)
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: 0.158013, -0.005358
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.034070,3.000000,0.009718
## *****sigma: 0.223363
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 48s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# control group: predictive check
# BART posterior-predictive draws subset to control rows vs observed outcomes
y0_pred <- read_outcome_control_BART$yhat.test

ppd_intervals(y0_pred[ ,control_ind],x=y[control_ind]) + geom_abline(intercept = 0, slope = 1)  +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's", title="Read: Predictive check for control group")

ppd_intervals(t(t(y0_pred[ ,control_ind])-y[control_ind]), x=y[control_ind]) + geom_abline(intercept = 0, slope = 0)  +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's", title = "Read: Predictive check for control group")

## Science ----

# Outcome for this section: log science score
y <- data$science_log
y_treat <- y[treat_ind]
y_control <- y[control_ind]

### Bayesian linear regression ----

# treatment group: fit model (same structure as the math section)
sci_outcome_treat <- list(N = nrow(X_treat),
                           N_test = nrow(X),
                           K = ncol(X_treat),
                           pr_sd = 3,
                           x = X_treat,
                           y = y_treat,
                           x_test = X)
sci_outcome_treat_BLR <- sampling(outcome_normal, data=sci_outcome_treat, iter=5000, warmup=4000, chains=1)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.003155 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 31.55 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 13.331 seconds (Warm-up)
## Chain 1:                3.722 seconds (Sampling)
## Chain 1:                17.053 seconds (Total)
## Chain 1:
# treatment group: check mixing
class(sci_outcome_treat_BLR)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(sci_outcome_treat_BLR)
# Label the first five posterior columns with the covariate names. Subsetting
# colnames(X) to the same length fixes the "number of items to replace is not
# a multiple of replacement length" warning caused by the length mismatch
# (colnames(X) has more than five entries).
colnames(post_smp)[1:5] <- colnames(X)[1:5]
mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)

# Trace plots for selected coefficients: a dense band with no drift
# suggests the chain is mixing well
plot(post_smp$gender, type='l')

plot(post_smp$family_class, type='l')

plot(post_smp$tech_access, type='l')

# treatment group: posterior predictive check
y1_pred <- extract(sci_outcome_treat_BLR)$y_rep

# Predicted vs observed: intervals should straddle the 45-degree line
ppd_intervals(y1_pred[, treat_ind], x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's")

# Residuals vs observed: errors should be centred on zero
ppd_intervals(t(t(y1_pred[, treat_ind]) - y[treat_ind]), x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's")

# Control group: assemble the Stan data list and draw posterior samples
sci_outcome_control <- list(
  N = nrow(X_control),   # training observations (public-school students)
  N_test = nrow(X),      # predictions are made for the full sample
  K = ncol(X_control),   # number of covariates
  pr_sd = 3,             # presumably the prior sd hyperparameter of the Stan model
  x = X_control,
  y = y_control,
  x_test = X
)
sci_outcome_control_BLR <- sampling(
  outcome_normal,
  data = sci_outcome_control,
  chains = 1, iter = 5000, warmup = 4000
)
## 
## SAMPLING FOR MODEL 'anon_model' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000325 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 3.25 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 5000 [  0%]  (Warmup)
## Chain 1: Iteration:  500 / 5000 [ 10%]  (Warmup)
## Chain 1: Iteration: 1000 / 5000 [ 20%]  (Warmup)
## Chain 1: Iteration: 1500 / 5000 [ 30%]  (Warmup)
## Chain 1: Iteration: 2000 / 5000 [ 40%]  (Warmup)
## Chain 1: Iteration: 2500 / 5000 [ 50%]  (Warmup)
## Chain 1: Iteration: 3000 / 5000 [ 60%]  (Warmup)
## Chain 1: Iteration: 3500 / 5000 [ 70%]  (Warmup)
## Chain 1: Iteration: 4000 / 5000 [ 80%]  (Warmup)
## Chain 1: Iteration: 4001 / 5000 [ 80%]  (Sampling)
## Chain 1: Iteration: 4500 / 5000 [ 90%]  (Sampling)
## Chain 1: Iteration: 5000 / 5000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 13.155 seconds (Warm-up)
## Chain 1:                4.073 seconds (Sampling)
## Chain 1:                17.228 seconds (Total)
## Chain 1:
# control group: check mixing
class(sci_outcome_control_BLR)
## [1] "stanfit"
## attr(,"package")
## [1] "rstan"
post_smp <- as.data.frame(sci_outcome_control_BLR)
# Label the first five posterior columns with the covariate names. Subsetting
# colnames(X) to the same length fixes the "number of items to replace is not
# a multiple of replacement length" warning caused by the length mismatch
# (colnames(X) has more than five entries).
colnames(post_smp)[1:5] <- colnames(X)[1:5]
mcmc_areas(post_smp[,1:5], pars = colnames(X)[1:5], prob = 0.8)

# Trace plots for selected coefficients: a dense band with no drift
# suggests the chain is mixing well
plot(post_smp$gender, type='l')

plot(post_smp$family_class, type='l')

plot(post_smp$tech_access, type='l')

# control group: posterior predictive check
y0_pred <- extract(sci_outcome_control_BLR)$y_rep

# Predicted vs observed: intervals should straddle the 45-degree line
ppd_intervals(y0_pred[, control_ind], x = y[control_ind]) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's")

# Residuals vs observed: errors should be centred on zero
ppd_intervals(t(t(y0_pred[, control_ind]) - y[control_ind]), x = y[control_ind]) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's")

BART

# treatment group: fit the model
sci_outcome_treat_BART <- wbart(x.train=as.matrix(X_treat[1:5]), 
                               y.train = y_treat,
                               x.test=as.matrix(X[1:5]),
                               w=data_AUS$stu_wgt)
## *****Into main of wbart
## *****Data:
## data:n,p,np: 5210, 5, 11806
## y1,yn: 0.030477, -0.223005
## x1,x[n*p]: 1.000000, 0.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.029797,3.000000,0.006803
## *****sigma: 0.186884
## *****w (weights): 9.906390 ... 10.067360
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 39s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# treatment group: predictive check
y1_pred <- sci_outcome_treat_BART$yhat.test

# BUG FIX: titles said "Read:" (copy-paste from the read section) and the
# second plot was mislabeled "control group"; this is science, treatment group.
ppd_intervals(y1_pred[, treat_ind], x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's",
                title = "Science: Predictive check for treatment group")

ppd_intervals(t(t(y1_pred[, treat_ind]) - y[treat_ind]), x = y[treat_ind]) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's",
                title = "Science: Predictive check for treatment group")

# control group: fit the model
sci_outcome_control_BART <- wbart(x.train=as.matrix(X_control[1:5]), 
                                 y.train = y_control, 
                                 x.test=as.matrix(X[1:5]), 
                                 w=data_AUS$stu_wgt)
## *****Into main of wbart
## *****Data:
## data:n,p,np: 6596, 5, 11806
## y1,yn: -0.079194, -0.131055
## x1,x[n*p]: 1.000000, 1.000000
## xp1,xp[np*p]: 1.000000, 0.000000
## *****Number of Trees: 200
## *****Number of Cut Points: 1 ... 1
## *****burn and ndpost: 100, 1000
## *****Prior:beta,alpha,tau,nu,lambda: 2.000000,0.950000,0.028021,3.000000,0.008300
## *****sigma: 0.206423
## *****w (weights): 9.906390 ... 20.005300
## *****Dirichlet:sparse,theta,omega,a,b,rho,augment: 0,0,1,0.5,1,5,0
## *****nkeeptrain,nkeeptest,nkeeptestme,nkeeptreedraws: 1000,1000,1000,1000
## *****printevery: 100
## *****skiptr,skipte,skipteme,skiptreedraws: 1,1,1,1
## 
## MCMC
## done 0 (out of 1100)
## done 100 (out of 1100)
## done 200 (out of 1100)
## done 300 (out of 1100)
## done 400 (out of 1100)
## done 500 (out of 1100)
## done 600 (out of 1100)
## done 700 (out of 1100)
## done 800 (out of 1100)
## done 900 (out of 1100)
## done 1000 (out of 1100)
## time: 53s
## check counts
## trcnt,tecnt,temecnt,treedrawscnt: 1000,1000,1000,1000
# control group: predictive check
y0_pred <- sci_outcome_control_BART$yhat.test

# BUG FIX: titles said "Read:" (copy-paste from the read section);
# this is the science outcome, control group.
ppd_intervals(y0_pred[, control_ind], x = y[control_ind]) +
  geom_abline(intercept = 0, slope = 1) +
  ggplot2::labs(y = "Predicted y's", x = "Observed y's",
                title = "Science: Predictive check for control group")

ppd_intervals(t(t(y0_pred[, control_ind]) - y[control_ind]), x = y[control_ind]) +
  geom_abline(intercept = 0, slope = 0) +
  ggplot2::labs(y = "Errors in predicted y's", x = "Observed y's",
                title = "Science: Predictive check for control group")

calculate CATE - doubly-robust estimator

# Subgroup masks: every combination of gender x family_class x tech_access.
# Codings (from the data prep): gender 1 = female, family_class 1 = middle
# class, tech_access 1 = has both computer and internet.
subgroup <- function(gender, family_class, tech_access) {
  (X$gender == gender) &
    (X$family_class == family_class) &
    (X$tech_access == tech_access)
}
group1_ind <- subgroup(1, 1, 1)  # female, middle class, tech access
group2_ind <- subgroup(0, 1, 1)  # male,   middle class, tech access
group3_ind <- subgroup(1, 0, 1)  # female, working class, tech access
group4_ind <- subgroup(0, 0, 1)  # male,   working class, tech access
group5_ind <- subgroup(1, 1, 0)  # female, middle class, no tech access
group6_ind <- subgroup(0, 1, 0)  # male,   middle class, no tech access
group7_ind <- subgroup(1, 0, 0)  # female, working class, no tech access
group8_ind <- subgroup(0, 0, 0)  # male,   working class, no tech access

math

y <- data$math

bayesian linear regression

# Map predictive draws back from log-scores to the raw score scale
y0_pred <- exp(extract(math_outcome_control_BLR)$y_rep)
y1_pred <- exp(extract(math_outcome_treat_BLR)$y_rep)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Posterior CATE draws: average the per-student contrasts within each subgroup
cate1_dr <- rowMeans(causal_contrasts[, group1_ind])
cate2_dr <- rowMeans(causal_contrasts[, group2_ind])
cate3_dr <- rowMeans(causal_contrasts[, group3_ind])
cate4_dr <- rowMeans(causal_contrasts[, group4_ind])
cate5_dr <- rowMeans(causal_contrasts[, group5_ind])
cate6_dr <- rowMeans(causal_contrasts[, group6_ind])
cate7_dr <- rowMeans(causal_contrasts[, group7_ind])
cate8_dr <- rowMeans(causal_contrasts[, group8_ind])
cate_math_dr_BLR <- as.data.frame(
  list("female, middle, tech"     = cate1_dr,
       "male, middle, tech"       = cate2_dr,
       "female, working, tech"    = cate3_dr,
       "male, working, tech"      = cate4_dr,
       "female, middle, no_tech"  = cate5_dr,
       "male, middle, no_tech"    = cate6_dr,
       "female, working, no_tech" = cate7_dr,
       "male, working, no_tech"   = cate8_dr),
  check.names = FALSE
)
par(mar = c(4, 11, 4, 2))  # wide left margin for the long group labels
boxplot(cate_math_dr_BLR, horizontal = TRUE, las = 1, main = "Math: CATE (with BLR)")

BART

# Map BART predictive draws back from log-scores to the raw score scale
y0_pred <- exp(math_outcome_control_BART$yhat.test)
y1_pred <- exp(math_outcome_treat_BART$yhat.test)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Posterior CATE draws: average the per-student contrasts within each subgroup
cate1_dr <- rowMeans(causal_contrasts[, group1_ind])
cate2_dr <- rowMeans(causal_contrasts[, group2_ind])
cate3_dr <- rowMeans(causal_contrasts[, group3_ind])
cate4_dr <- rowMeans(causal_contrasts[, group4_ind])
cate5_dr <- rowMeans(causal_contrasts[, group5_ind])
cate6_dr <- rowMeans(causal_contrasts[, group6_ind])
cate7_dr <- rowMeans(causal_contrasts[, group7_ind])
cate8_dr <- rowMeans(causal_contrasts[, group8_ind])
cate_math_dr_BART <- as.data.frame(
  list("female, middle, tech"     = cate1_dr,
       "male, middle, tech"       = cate2_dr,
       "female, working, tech"    = cate3_dr,
       "male, working, tech"      = cate4_dr,
       "female, middle, no_tech"  = cate5_dr,
       "male, middle, no_tech"    = cate6_dr,
       "female, working, no_tech" = cate7_dr,
       "male, working, no_tech"   = cate8_dr),
  check.names = FALSE
)
par(mar = c(4, 11, 4, 2))  # wide left margin for the long group labels
boxplot(cate_math_dr_BART, horizontal = TRUE, las = 1, main = "Math: CATE (with BART)")

#boxplot(cate, las=2)

read

y <- data$read

bayesian linear regression

# Map predictive draws back from log-scores to the raw score scale
y0_pred <- exp(extract(read_outcome_control_BLR)$y_rep)
y1_pred <- exp(extract(read_outcome_treat_BLR)$y_rep)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Posterior CATE draws: average the per-student contrasts within each subgroup
cate1_dr <- rowMeans(causal_contrasts[, group1_ind])
cate2_dr <- rowMeans(causal_contrasts[, group2_ind])
cate3_dr <- rowMeans(causal_contrasts[, group3_ind])
cate4_dr <- rowMeans(causal_contrasts[, group4_ind])
cate5_dr <- rowMeans(causal_contrasts[, group5_ind])
cate6_dr <- rowMeans(causal_contrasts[, group6_ind])
cate7_dr <- rowMeans(causal_contrasts[, group7_ind])
cate8_dr <- rowMeans(causal_contrasts[, group8_ind])
cate_read_dr_BLR <- as.data.frame(
  list("female, middle, tech"     = cate1_dr,
       "male, middle, tech"       = cate2_dr,
       "female, working, tech"    = cate3_dr,
       "male, working, tech"      = cate4_dr,
       "female, middle, no_tech"  = cate5_dr,
       "male, middle, no_tech"    = cate6_dr,
       "female, working, no_tech" = cate7_dr,
       "male, working, no_tech"   = cate8_dr),
  check.names = FALSE
)
par(mar = c(4, 11, 4, 2))  # wide left margin for the long group labels
boxplot(cate_read_dr_BLR, horizontal = TRUE, las = 1, main = "Read: CATE (with BLR)")

BART

# Map BART predictive draws back from log-scores to the raw score scale
y0_pred <- exp(read_outcome_control_BART$yhat.test)
y1_pred <- exp(read_outcome_treat_BART$yhat.test)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Posterior CATE draws: average the per-student contrasts within each subgroup
cate1_dr <- rowMeans(causal_contrasts[, group1_ind])
cate2_dr <- rowMeans(causal_contrasts[, group2_ind])
cate3_dr <- rowMeans(causal_contrasts[, group3_ind])
cate4_dr <- rowMeans(causal_contrasts[, group4_ind])
cate5_dr <- rowMeans(causal_contrasts[, group5_ind])
cate6_dr <- rowMeans(causal_contrasts[, group6_ind])
cate7_dr <- rowMeans(causal_contrasts[, group7_ind])
cate8_dr <- rowMeans(causal_contrasts[, group8_ind])
cate_read_dr_BART <- as.data.frame(
  list("female, middle, tech"     = cate1_dr,
       "male, middle, tech"       = cate2_dr,
       "female, working, tech"    = cate3_dr,
       "male, working, tech"      = cate4_dr,
       "female, middle, no_tech"  = cate5_dr,
       "male, middle, no_tech"    = cate6_dr,
       "female, working, no_tech" = cate7_dr,
       "male, working, no_tech"   = cate8_dr),
  check.names = FALSE
)
par(mar = c(4, 11, 4, 2))  # wide left margin for the long group labels
boxplot(cate_read_dr_BART, horizontal = TRUE, las = 1, main = "Read: CATE (with BART)")

#boxplot(cate, las=2)

science

y <- data$science

bayesian linear regression

# Back-transform the science predictive draws to the raw score scale
y0_pred <- exp(extract(sci_outcome_control_BLR)$y_rep)
y1_pred <- exp(extract(sci_outcome_treat_BLR)$y_rep)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Posterior CATE draws: average the per-student contrasts within each subgroup
cate1_dr <- rowMeans(causal_contrasts[, group1_ind])
cate2_dr <- rowMeans(causal_contrasts[, group2_ind])
cate3_dr <- rowMeans(causal_contrasts[, group3_ind])
cate4_dr <- rowMeans(causal_contrasts[, group4_ind])
cate5_dr <- rowMeans(causal_contrasts[, group5_ind])
cate6_dr <- rowMeans(causal_contrasts[, group6_ind])
cate7_dr <- rowMeans(causal_contrasts[, group7_ind])
cate8_dr <- rowMeans(causal_contrasts[, group8_ind])
cate_sci_dr_BLR <- data.frame(cate1_dr, cate2_dr, cate3_dr, cate4_dr, cate5_dr, cate6_dr, cate7_dr, cate8_dr)
# BUG FIX: the labels were assigned to cate_read_dr_BLR (copy-paste error),
# which left this data frame unlabeled and clobbered the read results' labels.
colnames(cate_sci_dr_BLR) <- c("female, middle, tech",
                               "male, middle, tech",
                               "female, working, tech",
                               "male, working, tech",
                               "female, middle, no_tech",
                               "male, middle, no_tech",
                               "female, working, no_tech",
                               "male, working, no_tech")
par(mar = c(4, 11, 4, 2))
boxplot(cate_sci_dr_BLR, horizontal = TRUE, las = 1, main = "Science: CATE (with BLR)")

BART

# Map BART predictive draws back from log-scores to the raw score scale
y0_pred <- exp(sci_outcome_control_BART$yhat.test)
y1_pred <- exp(sci_outcome_treat_BART$yhat.test)

causal_contrasts <- dr_estimate(y, t, y0_pred, y1_pred, ps)

# Posterior CATE draws: average the per-student contrasts within each subgroup
cate1_dr <- rowMeans(causal_contrasts[, group1_ind])
cate2_dr <- rowMeans(causal_contrasts[, group2_ind])
cate3_dr <- rowMeans(causal_contrasts[, group3_ind])
cate4_dr <- rowMeans(causal_contrasts[, group4_ind])
cate5_dr <- rowMeans(causal_contrasts[, group5_ind])
cate6_dr <- rowMeans(causal_contrasts[, group6_ind])
cate7_dr <- rowMeans(causal_contrasts[, group7_ind])
cate8_dr <- rowMeans(causal_contrasts[, group8_ind])
cate_science_dr_BART <- as.data.frame(
  list("female, middle, tech"     = cate1_dr,
       "male, middle, tech"       = cate2_dr,
       "female, working, tech"    = cate3_dr,
       "male, working, tech"      = cate4_dr,
       "female, middle, no_tech"  = cate5_dr,
       "male, middle, no_tech"    = cate6_dr,
       "female, working, no_tech" = cate7_dr,
       "male, working, no_tech"   = cate8_dr),
  check.names = FALSE
)
par(mar = c(4, 11, 4, 2))  # wide left margin for the long group labels
boxplot(cate_science_dr_BART, horizontal = TRUE, las = 1, main = "Science: CATE (with BART)")

#boxplot(cate, las=2)

some visualizations

# Overlaid wealth distributions: private (blue) vs public (red)
hist(X_treat$wealth, breaks = 30, col = rgb(0, 0, 1, 0.5),
     xlim = c(-7, 7), ylim = c(0, 1600), main = "wealth", xlab = "wealth")
hist(X_control$wealth, breaks = 30, col = rgb(1, 0, 0, 0.5), add = TRUE)

legend("topright", legend = c("private", "public"),
       fill = c(rgb(0, 0, 1, 0.5), rgb(1, 0, 0, 0.5)))

# NOTE(review): this block repeats the wealth histogram above verbatim —
# likely a copy-paste leftover; confirm whether a different variable was
# intended here.
hist(X_treat$wealth, col = rgb(0, 0, 1, 0.5), xlim = c(-7, 7), ylim=c(0, 1600),
     main = "wealth", xlab = "wealth", breaks = 30)
hist(X_control$wealth, col = rgb(1, 0, 0, 0.5), add = TRUE, breaks = 30)

legend("topright", legend = c("private", "public"), 
       fill = c(rgb(0, 0, 1, 0.5), rgb(1, 0, 0, 0.5)))

# Overlaid escs distributions: private (green) vs public (orange)
hist(X_treat$escs, breaks = 30, col = rgb(0.2, 0.8, 0.2, 0.5),
     xlim = c(-7, 7), ylim = c(0, 700), main = "escs", xlab = "escs")
hist(X_control$escs, breaks = 30, col = rgb(1, 0.6, 0, 0.5), add = TRUE)

legend("topright", legend = c("private", "public"),
       fill = c(rgb(0.2, 0.8, 0.2, 0.5), rgb(1, 0.6, 0, 0.5)))

# Share of each binary covariate by school type, shown side by side.
# (Fixed: `=` was used for top-level assignment; use `<-` per R convention.)
colors <- c(rgb(0, 0, 1, 0.5), rgb(0, 1, 1, 0.5))
variables <- c("gender", "family_class", "tech_access")
group <- c("private", "public")

# Row 1 = private (treatment), row 2 = public (control); columns follow
# `variables`. mean() of a 0/1 column equals sum()/nrow().
Values <- matrix(
  c(mean(X_treat$gender), mean(X_treat$family_class), mean(X_treat$tech_access),
    mean(X_control$gender), mean(X_control$family_class), mean(X_control$tech_access)),
  nrow = 2, ncol = 3, byrow = TRUE
)

# Create the bar chart
barplot(Values, main = "gender, family_class, tech_access", names.arg = variables,
        xlab = "variables", ylab = "percentage",
        col = colors, beside = TRUE)

# Add the legend to the chart
legend("topleft", group, cex = 0.7, fill = colors)

# Subgroup sizes, visualised as a pie chart
num <- vapply(
  list(group1_ind, group2_ind, group3_ind, group4_ind,
       group5_ind, group6_ind, group7_ind, group8_ind),
  sum, integer(1)
)
label <- c("female, middle, tech",
           "male, middle, tech",
           "female, working, tech",
           "male, working, tech",
           "female, middle, no_tech",
           "male, middle, no_tech",
           "female, working, no_tech",
           "male, working, no_tech")
par(mar = c(0.5, 0.5, 0.5, 0.5))  # minimal margins so the pie fills the device
pie(num, label, radius = 0.8)